The objective is to perform a sentiment analysis on each of the
interventions of each MP of each of the debates covered and to assign a
column with a tag (positive/negative/neutral) and a score for such
tag.
Sentiment analysis on R
Call the saved dataset in R:
python_df <- read_csv('/Users/lauramartinez/Desktop/Master/TFM/TFM_final/sentiment_python_df.csv')
Add Brexit
results (House of Commons Library, 2017) to the ParlVote
dataframe:
referendum <- read_excel("eureferendum_constituency.xlsx")
referendum <- referendum |>
rename(speaker_constituency = Constituency) |>
select(-`ONS ID`) |>
mutate(brexit = ifelse(leave_perc > 0.5, 1, 0))
In order to perform sentiment analysis and extract conclusions it is
best to select the variables that will be used for the analysis and
create a different dataframe:
# Select variables
brexit_df <- python_df |>
select(date, category, speaker_constituency, party, speech, sentiment, score)
# Add the referendum data
brexit_df <- brexit_df |>
inner_join(referendum, by = "speaker_constituency")
# Ensure the date column is in Date format
brexit_df$date <- as.Date(brexit_df$date)
# Convert sentiment to a factor
brexit_df$sentiment <- factor(brexit_df$sentiment)
Bar plot of sentiment distribution
After the dataframe is finally ready, we might be interested in
exploring the sentiment distribution of the speeches:
# Calculate relative frequencies
brexit_df_relative <- brexit_df |>
group_by(sentiment) |>
summarise(count = n()) |>
mutate(relative_count = count / sum(count))
# Plot the relative distribution of sentiment
ggplot(brexit_df_relative, aes(x = sentiment, y = relative_count, fill = sentiment)) +
geom_bar(stat = "identity", color = "white") +
scale_fill_manual(values = c("neutral" = "cadetblue2", "positive" = "#76EE00", "negative" = "red2")) +
theme_minimal(base_size = 15) + # Increase base font size for readability
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 18), # Bold title, center, increase size
axis.title.x = element_text(face = "bold", size = 14), # Bold x-axis label
axis.title.y = element_text(face = "bold", size = 14), # Bold y-axis label
axis.text = element_text(size = 12), # Size for axis tick labels
legend.title = element_blank(), # Remove legend title for a cleaner look
legend.position = "top", # Position legend at the top
legend.text = element_text(size = 12) # Size for legend text
) +
labs(
title = "Relative Distribution of Sentiment",
x = "Sentiment",
y = "Relative Frequency"
)

Sentiment distribution timeline
Also the distribution of sentiment through time was plotted:
# Calculate relative frequencies
relative_sentiment <- brexit_df |>
group_by(date, sentiment) |>
summarise(count = n()) |>
ungroup() |>
group_by(date) |>
mutate(relative_count = count / sum(count)) |>
ungroup()
# Plot the relative counts over time
ggplot(relative_sentiment, aes(x = date, y = relative_count, fill = sentiment)) +
geom_area(position = "fill", color = "white", alpha = 0.6) + # Use geom_area for stacked plot
scale_fill_manual(values = c("neutral" = "cadetblue2", "positive" = "#76EE00", "negative" = "red2")) +
theme_minimal(base_size = 15) + # Increase base font size for readability
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 18), # Bold title, center, increase size
axis.title.x = element_text(face = "bold", size = 14), # Bold x-axis label
axis.title.y = element_text(face = "bold", size = 14), # Bold y-axis label
axis.text = element_text(size = 12), # Size for axis tick labels
legend.title = element_blank(), # Remove legend title for a cleaner look
legend.position = "top", # Position legend at the top
legend.text = element_text(size = 12) # Size for legend text
) +
labs(
title = "Relative Distribution of Sentiment Over Time",
x = "Date",
y = "Relative Frequency"
)

Average sentiment per topic
Once the categories for each speech have been created and their
sentiment retrieved, let´s see which are the category average sentiment
scores depending on whether the constituency voted Brexit or not. In
this case no date filter is used because we are also interested in
seeing the sentiment generated after the Brexit around each topic.
# 1. Group by category and brexit, then compute the average sentiment score for each sentiment
category_brexit_summary <- brexit_df |>
group_by(category, brexit, sentiment) |>
summarise(avg_score = mean(score, na.rm = TRUE), .groups = 'drop') |>
ungroup()
Create a Brexit and No-Brexit profoile based on the topics
created:
# Filter for the sentiment with the highest avg_score for each category and brexit
top_sentiment_summary <- category_brexit_summary |>
group_by(category, brexit) |>
filter(avg_score == max(avg_score)) |>
ungroup()
And visualize it:
# Define colour palette
sentiment_colors <- c("positive" = "chartreuse2", "neutral" = "cadetblue2", "negative" = "red2")
# Create a bubble chart
ggplot(top_sentiment_summary, aes(x = factor(brexit), y = reorder(category, avg_score), size = avg_score, color = sentiment)) +
geom_point(alpha = 0.7) +
scale_size_continuous(range = c(3, 15)) + # Adjust the range to change bubble sizes
scale_x_discrete(labels = c("0" = "NO", "1" = "YES")) + # Rename the X-Axis labels
scale_color_manual(values = sentiment_colors) + # Apply the color palette
theme_minimal() +
labs(title = "Top Sentiment by Category and Brexit Status",
x = "Brexit Status",
y = "Category",
size = "Average Score",
color = "Sentiment") +
theme(axis.text.x = element_text(angle = 0, hjust = 0.5),
axis.title.y = element_blank(),
plot.title = element_text(size = 14, face = "bold"),
strip.text = element_text(size = 12))

Average sentiment per party
Taking the last party in government before Brexit as the party of
reference
# Ensure party is a factor
brexit_df$party <- as.factor(brexit_df$party)
# Group by category and party, then compute the average sentiment score for each sentiment
category_party_summary2 <- brexit_df |>
filter(format(date, "%Y") == "2016") |>
group_by(category, party, sentiment) |>
summarise(avg_score = mean(score, na.rm = TRUE), .groups = 'drop') |>
ungroup()
# Filter for the sentiment with the highest avg_score for each category and party
party_sentiment_summary <- category_party_summary2 |>
group_by(category, party) |>
filter(avg_score == max(avg_score)) |>
ungroup()
We plot the party average sentiment by topic:
# Create a bubble chart
ggplot(party_sentiment_summary, aes(x = party, y = reorder(category, avg_score), size = avg_score, color = sentiment)) +
geom_point(alpha = 0.7) +
scale_size_continuous(range = c(3, 15)) + # Adjust the range to change bubble sizes
scale_color_manual(values = sentiment_colors) + # Apply the color palette
theme_minimal() +
labs(title = "Top Sentiment by Category and Party",
x = "Party",
y = "Category",
size = "Average Score",
color = "Sentiment") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.y = element_blank(),
plot.title = element_text(size = 14, face = "bold"),
strip.text = element_text(size = 12))

Merge datasets on category to include sentiments of Pro and
Anti-Brexit profiles as a baseline for y-axis lines:
# Merge datasets on category to include sentiments for y-axis lines
merged_data <- party_sentiment_summary |>
left_join(top_sentiment_summary, by = "category", suffix = c("_party", "_axis"), relationship = "many-to-many")
Then the two plots for the Pro-Brexit and Anti-Brexit profiles are
created:
# Create the plot for brexit = 0
plot_brexit_0 <- ggplot(merged_data |> filter(brexit == 0),
aes(x = party, y = reorder(category, avg_score_party), size = avg_score_party, color = sentiment_party)) +
geom_point(alpha = 0.7) +
scale_size_continuous(range = c(3, 15)) +
scale_color_manual(values = sentiment_colors) +
theme_minimal() +
labs(title = "Top Sentiment by Category and Party (NO Brexit)",
x = "Party",
y = "Category",
size = "Average Score",
color = "Sentiment") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.y = element_blank(),
plot.title = element_text(size = 14, face = "bold"),
strip.text = element_text(size = 12)) +
geom_hline(aes(yintercept = category, color = sentiment_axis), size = 0.2)
# Create the plot for brexit = 1
plot_brexit_1 <- ggplot(merged_data |> filter(brexit == 1),
aes(x = party, y = reorder(category, avg_score_party), size = avg_score_party, color = sentiment_party)) +
geom_point(alpha = 0.7) +
scale_size_continuous(range = c(3, 15)) +
scale_color_manual(values = sentiment_colors) +
theme_minimal() +
labs(title = "Top Sentiment by Category and Party (Brexit)",
x = "Party",
y = "Category",
size = "Average Score",
color = "Sentiment") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.y = element_blank(),
plot.title = element_text(size = 14, face = "bold"),
strip.text = element_text(size = 12)) +
geom_hline(aes(yintercept = category, color = sentiment_axis), size = 0.2)
# Print the plots
print(plot_brexit_0)

print(plot_brexit_1)

All results depend on whether I create the average score using
previous data or just 2016 data
Main party governing
I need to first find the pictures of the logos:
Now we will see which party was the one most frequently in power in
Pro-Brexit and Anti-Brexit areas:
#Use as reference party the one governing in 2016
party_brexit_summary <- brexit_df |>
filter(format(date, "%Y") == "2016") |>
distinct(speaker_constituency, brexit, party) |> # Ensure each constituency is counted only once
group_by(brexit, party) |>
summarise(constituency_count = n(), .groups = 'drop') |> # Count unique constituencies per brexit and party
mutate(total_constituencies = n_distinct(brexit_df$speaker_constituency), # Total number of constituencies
relative_freq = constituency_count / total_constituencies) |> # Calculate relative frequency based on total constituencies
arrange(brexit, desc(relative_freq)) |>
group_by(brexit) |>
slice_max(order_by = relative_freq, n = 5, with_ties = FALSE)
print(party_brexit_summary)
## # A tibble: 10 × 5
## # Groups: brexit [2]
## brexit party constituency_count total_constituencies relative_freq
## <dbl> <fct> <int> <int> <dbl>
## 1 0 labour 58 639 0.0908
## 2 0 conservative 57 639 0.0892
## 3 0 scottish-nation… 49 639 0.0767
## 4 0 labourco-operat… 13 639 0.0203
## 5 0 liberal-democrat 6 639 0.00939
## 6 1 conservative 188 639 0.294
## 7 1 labour 112 639 0.175
## 8 1 labourco-operat… 8 639 0.0125
## 9 1 dup 5 639 0.00782
## 10 1 liberal-democrat 2 639 0.00313
We must define the dierctory in which we have the images we
previously downloaded and stored:
logo_dir <- "~/Desktop/Master/TFM/TFM_final" # Change this to your working directory
# Create a dataframe with the names of the parties and the corresponding paths to the logo files
party_logos <- data.frame(
party = c("scottish-national-party", "labour", "conservative", "labourco-operative", "liberal-democrat", "dup"),
logo = file.path(logo_dir, c(
"scottish-national-party.png",
"labour.png",
"conservative.png",
"labourco-operative.png",
"liberal-democrat.png",
"dup.png"
))
)
# Create a function to resize the images
resize_image <- function(image_path, width = 120, height = 80) {
# Read the image from the specified file path
image <- image_read(image_path)
# Resize the image to the specified W & H, forcing the exact dimensions
image <- image_resize(image, paste0(width, "x", height, "!"))
# Create a temporary file with a .png extension to store the resized image
temp_file <- tempfile(fileext = ".png")
# Write the resized image to the temporary file in PNG format
image_write(image, path = temp_file, format = "png")
# Return the path to the temporary file containing the resized image
return(temp_file)
}
party_logos$logo <- sapply(party_logos$logo, resize_image)
# Merge the logos df with party_brexit_summary
party_brexit_summary_with_logos <- party_brexit_summary |>
left_join(party_logos, by = "party")
Prepare the data for the two different sides (pro-Brexit and
anti-Brexit) and set a colour palette for the most common parties in the
House of Commons:
# Filter data for Brexit and No Brexit plots
brexit_data <- party_brexit_summary_with_logos |> filter(brexit == 1)
no_brexit_data <- party_brexit_summary_with_logos |> filter(brexit == 0)
party_colors <- c(
"labour" = "red3",
"conservative" = "dodgerblue3",
"scottish-national-party" = "lemonchiffon",
"labourco-operative" = "darkmagenta",
"dup" = "darksalmon",
"liberal-democrat" = "goldenrod1"
)
Plot the results:
# Determine the common y-axis limit
y_max <- max(c(max(brexit_data$relative_freq+ 0.03), max(no_brexit_data$relative_freq)))
# Create Brexit plot with fixed y-axis limits
brexit_plot <- ggplot(brexit_data, aes(x = reorder(party, relative_freq), y = relative_freq, fill = party)) +
geom_bar(stat = "identity", position = "dodge", width = 0.7) +
geom_image(aes(image = logo, y = relative_freq + 0.015), position = position_dodge(width = 0.7), size = 0.1) +
scale_fill_manual(values = party_colors) +
theme_minimal() +
labs(
title = "",
x = "Brexit areas",
y = "Proportion of Electoral representation in Brexit districts"
) +
theme(
axis.text.x = element_blank(),
axis.title.x = element_text(size = 12, face = "bold"),
legend.position = "none",
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed"),
panel.grid.minor.y = element_line(linetype = "dashed")
) +
ylim(0, y_max)
# Create No Brexit plot with fixed y-axis limits
no_brexit_plot <- ggplot(no_brexit_data, aes(x = reorder(party, relative_freq), y = relative_freq, fill = party)) +
geom_bar(stat = "identity", position = "dodge", width = 0.7) +
geom_image(aes(image = logo, y = relative_freq + 0.015), position = position_dodge(width = 0.7), size = 0.1) +
scale_fill_manual(values = party_colors) +
theme_minimal() +
labs(
title = "",
x = "No Brexit areas",
y = ""
) +
theme(
axis.text.x = element_blank(),
axis.title.x = element_text(size = 12, face = "bold"),
legend.position = "none",
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed"),
panel.grid.minor.y = element_line(linetype = "dashed")
) +
ylim(0, y_max)
# Manually create the legend
legend_plot <- ggplot(data.frame(party = names(party_colors), count = rep(1, length(party_colors))),
aes(x = party, y = count, fill = party)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = party_colors) +
theme_void() +
theme(
legend.position = "right",
legend.title = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 10)
) +
guides(fill = guide_legend(title = "Party"))
# Extract the legend as a separate plot
legend <- ggplotGrob(legend_plot)$grobs[[which(sapply(ggplotGrob(legend_plot)$grobs, function(x) x$name) == "guide-box")]]
# Combine the two plots with a common title
combined_plot <- plot_grid(
brexit_plot, no_brexit_plot,
ncol = 2, rel_widths = c(1, 1)
)
# Add the common title and combine with the legend
final_plot <- plot_grid(
ggdraw() +
draw_plot(combined_plot, 0.1, 0, 0.9, 0.9) +
draw_label("Top 5 Parties by Brexit Result in 2016", x = 0.6, y = 0.93, fontface = 'bold', size = 20),
legend,
ncol = 2,
rel_widths = c(0.85, 0.15)
)
# Display the final plot
print(final_plot)

Correlation between the main party governing before Brexit and score
on each of the topics
I will create a score for each topic in terms of brexit or No brexit.
Then see the impact of having one party governing in a constituency or
another in showing a profile closer to the Brexit or No brexit
status.
Sentiment needs to be from 2013 to capture the ideas/c concepts of
the parties regarding brexit but only the representation in 2016 (party
in power in 2016) might have a correltion. This can be maybe because
even the electoral win of those MPs in certain areas was forecoming the
brexit results as well.
First: Average sentiment per topic per party:
# Calculate average sentiment per topic per party
avg_sentiment <- brexit_df |>
group_by(category, party, sentiment) |>
summarize(avg_score = mean(score, na.rm = TRUE)) |>
ungroup()
# Select the sentiment with the highest score for each topic per party
max_sentiment <- avg_sentiment |>
group_by(category, party) |>
filter(avg_score == max(avg_score)) |>
ungroup() |>
distinct(category, party, .keep_all = TRUE)
Filter for 2016 and Calculate Average Leave Percentage for each party
based on brexit status
# Filter for the year 2016 and select relevant columns
df_2016 <- brexit_df |>
filter(format(as.Date(date, format="%Y-%m-%d"), "%Y") == "2016") |>
select(party, brexit, leave_perc)
# Calculate the average leave percentage for each party based on Brexit status
avg_leave_perc <- df_2016 |>
group_by(party, brexit) |>
summarize(avg_leave_perc = mean(leave_perc, na.rm = TRUE)) |>
ungroup()
Merge data frames:
# Merge the two data frames on the party column
final_df <- left_join(max_sentiment, avg_leave_perc, by = "party", relationship = "many-to-many")
# Display the resulting dataframe structure
str(final_df)
## tibble [218 × 6] (S3: tbl_df/tbl/data.frame)
## $ category : chr [1:218] "Crime/Law" "Crime/Law" "Crime/Law" "Crime/Law" ...
## $ party : Factor w/ 14 levels "alliance","conservative",..: 2 2 4 5 6 6 7 7 8 8 ...
## $ sentiment : Factor w/ 3 levels "negative","neutral",..: 3 3 2 1 3 3 2 2 3 3 ...
## $ avg_score : num [1:218] 0.668 0.668 0.749 0.638 0.627 ...
## $ brexit : num [1:218] 0 1 0 0 0 1 0 1 0 1 ...
## $ avg_leave_perc: num [1:218] 0.421 0.581 0.259 0.426 0.375 ...
Test the correlation/causation:
Let´s carry out a few ANOVA preliminary tests:
To see if there are any significant differences between between
different parties in the average (leave) Brexit result:
# Convert party to a factor
final_df$party <- as.factor(final_df$party)
# Perform ANOVA
anova_party <- aov(avg_leave_perc ~ party, data = final_df)
summary(anova_party)
## Df Sum Sq Mean Sq F value Pr(>F)
## party 10 1.135 0.11348 19.96 <2e-16 ***
## Residuals 196 1.114 0.00569
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 11 observations deleted due to missingness
And plot the results in a table:
# Extract the ANOVA summary as a data frame
anova_summary <- summary(anova_party)[[1]]
# Convert the summary table into a data frame with meaningful row names
anova_df <- data.frame(
Term = c("party", "Residuals"),
Df = anova_summary[, "Df"],
`Sum Sq` = round(anova_summary[, "Sum Sq"], 3), # Rounded for better display
`Mean Sq` = round(anova_summary[, "Mean Sq"], 5),
`F value` = round(anova_summary[, "F value"], 2),
`Pr(>F)` = format.pval(anova_summary[, "Pr(>F)"], digits = 3, eps = 0.001)
)
# Add significance codes manually
anova_df$Signif <- c("***", "")
# Create the table using gt
anova_gt_table <- anova_df |>
gt() |> # Convert the data frame into a gt table object
tab_header(
title = "ANOVA Summary Table" # Add a title to the table
) |>
cols_label(
Term = "Term", # Rename the 'Term' column
Df = "Degrees of Freedom", # Rename the 'Df' column
Sum.Sq = "Sum of Squares", # Correctly rename the 'Sum.Sq' column
Mean.Sq = "Mean Square", # Correctly rename the 'Mean.Sq' column
F.value = "F Value", # Correctly rename the 'F.value' column
Pr..F. = "p-value", # Correctly rename the 'Pr..F.' column
Signif = "Signif. Codes" # Rename the 'Signif' column
) |>
tab_style(
style = cell_text(weight = "bold"), # Bold style for the column headers
locations = cells_column_labels(everything())
) |>
tab_footnote(
# Add a footnote explaining significance codes
footnote = "Significance codes: *** p < 0.001",
# Place the footnote under the 'Signif. Codes' header
locations = cells_column_labels("Signif")
)
# Display the table
anova_gt_table # Render the gt table in the output
| ANOVA Summary Table |
| Term |
Degrees of Freedom |
Sum of Squares |
Mean Square |
F Value |
p-value |
Signif. Codes |
| party |
10 |
1.135 |
0.11348 |
19.96 |
<0.001 |
*** |
| Residuals |
196 |
1.114 |
0.00569 |
NA |
NA |
|
To determine if the sentiments expressed in speeches of each category
by each party have a significant direct effect on the average leave
percentage (i.e., testing for a triple interaction) we can do a
multi-way ANOVA
final_df$sentiment <- as.factor(final_df$sentiment)
final_df$category <- as.factor(final_df$category)
# Perform a multi-way ANOVA with triple interaction
anova_multi <- aov(avg_leave_perc ~ party * sentiment * category, data = final_df)
summary(anova_multi)
## Df Sum Sq Mean Sq F value Pr(>F)
## party 10 1.135 0.11348 8.35 3.33e-09 ***
## sentiment 2 0.000 0.00000 0.00 1
## category 11 0.000 0.00000 0.00 1
## party:sentiment 19 0.000 0.00000 0.00 1
## party:category 82 0.000 0.00000 0.00 1
## Residuals 82 1.114 0.01359
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 11 observations deleted due to missingness
And plot the table:
# Extract the ANOVA summary as a data frame
anova_multi_summary <- summary(anova_multi)[[1]]
# Convert the summary table into a data frame with meaningful row names
anova_multi_df <- data.frame(
Term = rownames(anova_multi_summary),
Df = anova_multi_summary[, "Df"],
`Sum Sq` = round(anova_multi_summary[, "Sum Sq"], 3), # Rounded for better display
`Mean Sq` = round(anova_multi_summary[, "Mean Sq"], 5),
`F value` = round(anova_multi_summary[, "F value"], 2),
`Pr(>F)` = format.pval(anova_multi_summary[, "Pr(>F)"], digits = 3, eps = 0.001)
)
# Add significance codes manually based on the results
anova_multi_df$Signif <- c("***", "", "", "", "", "")
# Create the table using gt
anova_multi_gt_table <- anova_multi_df |>
gt() |> # Convert the data frame into a gt table object
tab_header(
title = "Multi-Way ANOVA Summary Table" # Add a title to the table
) |>
cols_label(
Term = "Term", # Rename the 'Term' column
Df = "Degrees of Freedom", # Rename the 'Df' column
Sum.Sq = "Sum of Squares", # Correctly rename the 'Sum.Sq' column
Mean.Sq = "Mean Square", # Correctly rename the 'Mean.Sq' column
F.value = "F Value", # Correctly rename the 'F.value' column
Pr..F. = "p-value", # Correctly rename the 'Pr..F.' column
Signif = "Signif. Codes" # Rename the 'Signif' column
) |>
tab_style(
style = cell_text(weight = "bold"), # Bold style for the column headers
locations = cells_column_labels(everything()) # Apply to all column headers
) |>
tab_footnote(
# Add a footnote explaining significance codes
footnote = "Significance codes: *** p < 0.001",
# Place the footnote under the 'Signif. Codes' header
locations = cells_column_labels("Signif")
)
# Display the table
anova_multi_gt_table # Render the gt table in the output
| Multi-Way ANOVA Summary Table |
| Term |
Degrees of Freedom |
Sum of Squares |
Mean Square |
F Value |
p-value |
Signif. Codes |
| party |
10 |
1.135 |
0.11348 |
8.35 |
<0.001 |
*** |
| sentiment |
2 |
0.000 |
0.00000 |
0.00 |
1 |
|
| category |
11 |
0.000 |
0.00000 |
0.00 |
1 |
|
| party:sentiment |
19 |
0.000 |
0.00000 |
0.00 |
1 |
|
| party:category |
82 |
0.000 |
0.00000 |
0.00 |
1 |
|
| Residuals |
82 |
1.114 |
0.01359 |
NA |
NA |
|
The results indicate that while the party of the speaker
significantly influences the average leave percentage, the combined
effects of party, sentiment, and category do not have a significant
impact. This suggests that the influence of sentiments expressed in
speeches of each category by each party does not significantly vary the
average leave percentage.
Also I would like to look at this last relationship with further
detail before jumping into a regression analysis. Correlation
Analysis for Each Type of Sentiment:
# Create a function to calculate correlation for each sentiment
calculate_correlation <- function(df, sentiment_type) {
subset_df <- df |> filter(sentiment == sentiment_type)
cor(subset_df$avg_leave_perc, subset_df$avg_score, use = "complete.obs")
}
# Calculate correlation for each sentiment type
sentiment_types <- levels(final_df$sentiment)
correlations <- sapply(sentiment_types, function(sentiment) {
calculate_correlation(final_df, sentiment)
})
# Print correlations for each sentiment type
names(correlations) <- sentiment_types
print(correlations)
## negative neutral positive
## -0.28883771 -0.01732026 0.02238975
And plot this:
# Convert the correlations to a data frame
correlations_df <- data.frame(
sentiment = names(correlations),
correlation = correlations
)
# Create a dot plot
ggplot(correlations_df, aes(x = sentiment, y = correlation, color = sentiment)) +
geom_point(size = 5) + # Create dots with a specified size
geom_hline(yintercept = 0, linetype = "dashed", color = "gray") + # Add a dashed line at y = 0 for reference
theme_minimal(base_size = 15) + # Minimal theme with increased base size
labs(
title = "Correlation Between Avg Leave % and Avg Score by Sentiment Type",
x = "Sentiment Type",
y = "Correlation"
) +
geom_text(aes(label = round(correlation, 3)), vjust = -1, size = 5) + # Add correlation values next to dots
scale_color_manual(values = c("negative" = "red3", "neutral" = "gray", "positive" = "springgreen")) +
ylim(-0.35, 0.1) + # Adjust y-axis limits to add space at the top
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 16), # Bold, centered title
axis.title.x = element_text(face = "bold", size = 14), # Bold x-axis label
axis.title.y = element_text(face = "bold", size = 14), # Bold y-axis label
axis.text = element_text(size = 12), # Size for axis tick labels
legend.position = "none", # Remove legend
plot.margin = margin(t = 20, r = 20, b = 30, l = 20), # Adjust plot margins to add space around the plot
axis.text.x = element_text(margin = margin(t = 10)) # Add space below the x-axis text labels
)

Linear regression model:
# Remove rows with NA values before running regression
final_df_complete <- final_df |>
drop_na()
# Relevel the party factor to set a different reference category *** CAREFUL = NAs!!!
final_df$party <- relevel(final_df$party, ref = "ukip") #party that supported brexit the most
# Fit a linear regression model
model <- lm(avg_leave_perc ~ party + category*sentiment , data = final_df)
summary(model)
##
## Call:
## lm(formula = avg_leave_perc ~ party + category * sentiment, data = final_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.11106 -0.07906 0.00000 0.07906 0.11106
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error
## (Intercept) 5.008e-01 8.942e-02
## partydup 2.249e-02 2.633e-02
## partygreen -2.415e-01 3.353e-02
## partyindependent -7.480e-02 3.342e-02
## partylabour -1.479e-02 2.664e-02
## partylabourco-operative 7.596e-03 2.649e-02
## partyliberal-democrat -1.501e-02 2.672e-02
## partyplaid-cymru -3.772e-02 2.818e-02
## partyscottish-national-party -4.956e-02 2.591e-02
## partysocial-democratic-and-labour-party -2.324e-01 3.389e-02
## partyuup -8.742e-02 3.901e-02
## categoryDefense/Foreign Affairs 2.468e-15 1.236e-01
## categoryEconomy/Technology 1.318e-15 1.010e-01
## categoryEducation 1.326e-15 1.009e-01
## categoryEmployment 1.367e-15 1.006e-01
## categoryEnergy/Environment 1.402e-15 9.122e-02
## categoryEU/Brexit 1.732e-15 1.235e-01
## categoryHealth 1.464e-15 9.585e-02
## categoryHousing 1.382e-15 9.444e-02
## categoryParliamentary proceedings -3.882e-16 4.267e-02
## categorySocial Security/Society/Welfare 1.475e-15 9.347e-02
## categoryTransport 1.389e-15 9.559e-02
## sentimentneutral 1.956e-15 9.774e-02
## sentimentpositive 1.805e-15 9.263e-02
## categoryDefense/Foreign Affairs:sentimentneutral -3.099e-15 1.358e-01
## categoryEconomy/Technology:sentimentneutral -1.739e-15 1.182e-01
## categoryEducation:sentimentneutral -1.866e-15 1.164e-01
## categoryEmployment:sentimentneutral -2.119e-15 1.459e-01
## categoryEnergy/Environment:sentimentneutral -1.908e-15 1.096e-01
## categoryEU/Brexit:sentimentneutral -2.398e-15 1.601e-01
## categoryHealth:sentimentneutral -2.047e-15 1.126e-01
## categoryHousing:sentimentneutral -1.830e-15 1.217e-01
## categoryParliamentary proceedings:sentimentneutral -1.419e-16 6.778e-02
## categorySocial Security/Society/Welfare:sentimentneutral -1.978e-15 1.161e-01
## categoryTransport:sentimentneutral -1.866e-15 1.225e-01
## categoryDefense/Foreign Affairs:sentimentpositive -2.937e-15 1.317e-01
## categoryEconomy/Technology:sentimentpositive -1.743e-15 1.092e-01
## categoryEducation:sentimentpositive -1.698e-15 1.101e-01
## categoryEmployment:sentimentpositive -1.740e-15 1.080e-01
## categoryEnergy/Environment:sentimentpositive -1.822e-15 1.052e-01
## categoryEU/Brexit:sentimentpositive -2.143e-15 1.291e-01
## categoryHealth:sentimentpositive -1.827e-15 1.073e-01
## categoryHousing:sentimentpositive -1.775e-15 1.046e-01
## categoryParliamentary proceedings:sentimentpositive NA NA
## categorySocial Security/Society/Welfare:sentimentpositive -1.887e-15 1.039e-01
## categoryTransport:sentimentpositive -1.768e-15 1.043e-01
## t value Pr(>|t|)
## (Intercept) 5.600 8.97e-08 ***
## partydup 0.854 0.3943
## partygreen -7.202 2.11e-11 ***
## partyindependent -2.238 0.0266 *
## partylabour -0.555 0.5795
## partylabourco-operative 0.287 0.7747
## partyliberal-democrat -0.562 0.5751
## partyplaid-cymru -1.338 0.1827
## partyscottish-national-party -1.913 0.0575 .
## partysocial-democratic-and-labour-party -6.858 1.40e-10 ***
## partyuup -2.241 0.0264 *
## categoryDefense/Foreign Affairs 0.000 1.0000
## categoryEconomy/Technology 0.000 1.0000
## categoryEducation 0.000 1.0000
## categoryEmployment 0.000 1.0000
## categoryEnergy/Environment 0.000 1.0000
## categoryEU/Brexit 0.000 1.0000
## categoryHealth 0.000 1.0000
## categoryHousing 0.000 1.0000
## categoryParliamentary proceedings 0.000 1.0000
## categorySocial Security/Society/Welfare 0.000 1.0000
## categoryTransport 0.000 1.0000
## sentimentneutral 0.000 1.0000
## sentimentpositive 0.000 1.0000
## categoryDefense/Foreign Affairs:sentimentneutral 0.000 1.0000
## categoryEconomy/Technology:sentimentneutral 0.000 1.0000
## categoryEducation:sentimentneutral 0.000 1.0000
## categoryEmployment:sentimentneutral 0.000 1.0000
## categoryEnergy/Environment:sentimentneutral 0.000 1.0000
## categoryEU/Brexit:sentimentneutral 0.000 1.0000
## categoryHealth:sentimentneutral 0.000 1.0000
## categoryHousing:sentimentneutral 0.000 1.0000
## categoryParliamentary proceedings:sentimentneutral 0.000 1.0000
## categorySocial Security/Society/Welfare:sentimentneutral 0.000 1.0000
## categoryTransport:sentimentneutral 0.000 1.0000
## categoryDefense/Foreign Affairs:sentimentpositive 0.000 1.0000
## categoryEconomy/Technology:sentimentpositive 0.000 1.0000
## categoryEducation:sentimentpositive 0.000 1.0000
## categoryEmployment:sentimentpositive 0.000 1.0000
## categoryEnergy/Environment:sentimentpositive 0.000 1.0000
## categoryEU/Brexit:sentimentpositive 0.000 1.0000
## categoryHealth:sentimentpositive 0.000 1.0000
## categoryHousing:sentimentpositive 0.000 1.0000
## categoryParliamentary proceedings:sentimentpositive NA NA
## categorySocial Security/Society/Welfare:sentimentpositive 0.000 1.0000
## categoryTransport:sentimentpositive 0.000 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08294 on 162 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.5045, Adjusted R-squared: 0.37
## F-statistic: 3.749 on 44 and 162 DF, p-value: 4.913e-10
Many of the interaction terms between category and
sentiment are not significant, with p-values of 1.0000,
indicating no effect on the outcome variable.
Yet we might want to see some visualizations of the analysis:
library(ggplot2)
library(dplyr)
# Filter significant coefficients (p < 0.05)
significant_coefs <- broom::tidy(model) |>
filter(p.value < 0.05)
# Interaction plot between category and sentiment
interaction_plot <- interaction.plot(
final_df$sentiment, final_df$category, final_df$avg_leave_perc,
col = c("red", "green", "blue"), legend = TRUE,
ylab = "Avg Leave %", xlab = "Sentiment",
trace.label = "Category"
)

# Coefficients plot:
# Tidy the model output
tidy_model <- tidy(model, conf.int = TRUE)
# Filter out the intercept for clarity and remove rows with missing estimates
tidy_model <- tidy_model |>
filter(term != "(Intercept)") |>
filter(!is.na(estimate))
# Highlight significant variables
tidy_model <- tidy_model |>
mutate(term_label = ifelse(p.value < 0.05, paste0("<b>", term, "</b>"), term))
# Create the plot
highlighted_plot <- ggplot(tidy_model, aes(x = reorder(term_label, estimate), y = estimate)) +
geom_point(aes(color = p.value < 0.05), size = 3) + # Color points by significance
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
coord_flip() + # Flip the coordinates for better readability
labs(
title = "Coefficient Plot of Linear Regression Model",
x = "Predictors",
y = "Coefficient Estimate",
color = "Significant (p < 0.05)"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16),
axis.title = element_text(size = 14),
axis.text.y = element_markdown(size = 12), # Use element_markdown to support HTML tags
axis.text.x = element_text(size = 12),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)
)
print(highlighted_plot)

# Residuals vs Fitted Plot
plot(model, which = 1) # This will give a residuals vs fitted plot

# Q-Q Plot
plot(model, which = 2) # This will give a Q-Q plot

# Cook's Distance Plot
plot(model, which = 4) # This will give a Cook's distance plot

The Green Party, Independent Party, Social Democratic and Labour
Party, and UUP Party all have significant negative influences. Check
party influence alone:
model2 <- lm(avg_leave_perc ~ party, data = final_df)
summary(model2)
##
## Call:
## lm(formula = avg_leave_perc ~ party, data = final_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.11106 -0.07906 0.00000 0.07906 0.11106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.500808 0.015392 32.537 < 2e-16
## partydup 0.022488 0.022257 1.010 0.31356
## partygreen -0.241472 0.026660 -9.058 < 2e-16
## partyindependent -0.074799 0.026660 -2.806 0.00553
## partylabour -0.014794 0.021767 -0.680 0.49754
## partylabourco-operative 0.007596 0.021767 0.349 0.72751
## partyliberal-democrat -0.015008 0.021767 -0.689 0.49135
## partyplaid-cymru -0.037720 0.022257 -1.695 0.09171
## partyscottish-national-party -0.049562 0.021767 -2.277 0.02387
## partysocial-democratic-and-labour-party -0.232427 0.027456 -8.466 5.92e-15
## partyuup -0.087417 0.030784 -2.840 0.00499
##
## (Intercept) ***
## partydup
## partygreen ***
## partyindependent **
## partylabour
## partylabourco-operative
## partyliberal-democrat
## partyplaid-cymru .
## partyscottish-national-party *
## partysocial-democratic-and-labour-party ***
## partyuup **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0754 on 196 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.5045, Adjusted R-squared: 0.4793
## F-statistic: 19.96 on 10 and 196 DF, p-value: < 2.2e-16
11 missing values because the parties “alliance”, “ukip”, and
“respect” did not participate in all debates regarding all the topics
and thus some NAs are generated
Sentiment evolution through time
brexit_eu_brexit <- brexit_df |>
mutate(date = as.Date(date)) |>
filter(category == "EU/Brexit") |>
group_by(date, sentiment) |>
summarize(avg_score = mean(score, na.rm = TRUE), .groups = 'drop')
library(ggplot2)
library(plotly)
# Define custom colors
line_colors <- c("negative" = "coral", "positive" = "darkolivegreen2", "neutral" = "darkslategray2")
smooth_colors <- c("red3", "darkgreen", "deepskyblue3")
# Ensure the date column is in Date format
brexit_eu_brexit$date <- as.Date(brexit_eu_brexit$date)
# Create the plot with geom_line
plot <- ggplot(brexit_eu_brexit, aes(x = date, y = avg_score, group = sentiment)) +
geom_line(aes(color = sentiment), size = 0.5) +
scale_color_manual(values = line_colors, name = "Sentiment") +
labs(title = "Evolution of Average Sentiment Score for EU-Brexit",
x = "Date",
y = "Average Sentiment Score") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
) +
scale_x_date(date_labels = "%Y", date_breaks = "1 year")
# Add the geom_smooth layers separately with manual colors
for (i in seq_along(smooth_colors)) {
sentiment_type <- unique(brexit_eu_brexit$sentiment)[i]
plot <- plot +
geom_smooth(data = subset(brexit_eu_brexit, sentiment == sentiment_type),
aes(x = date, y = avg_score),
method = "loess",
se = FALSE,
linetype = "dashed",
size = 1,
color = smooth_colors[i])
}
# Add a vertical line for 23 June 2016
plot <- plot +
geom_vline(xintercept = as.Date("2016-06-23"), color = "red", size = 1, linetype = "solid")
plot

We can also have in html an interactive version of the plot
# Make the plot interactive
interactive_plot <- ggplotly(plot)
interactive_plot